/*
 * Copyright 2009 Nicolai Haehnle.
 * Copyright 2010 Tom Stellard <tstellar@gmail.com>
 * SPDX-License-Identifier: MIT
 */

#include "util/u_math.h"

#include "radeon_dataflow.h"

#include "radeon_compiler.h"
#include "radeon_compiler_util.h"
#include "radeon_list.h"
#include "radeon_swizzle.h"
#include "radeon_variable.h"

/* State handed to src_clobbered_reads_cb() while checking whether a register
 * write overlaps one of the sources read by the writer instruction of a
 * reader scan. */
struct src_clobbered_reads_cb_data {
   /* Register file of the write being examined. */
   rc_register_file File;
   /* Register index of the write being examined. */
   unsigned int Index;
   /* Channel mask of the write being examined. */
   unsigned int Mask;
   /* Reader scan whose AbortOnRead field is set when an overlap is found. */
   struct rc_reader_data *ReaderData;
};

/* Callback used by presub_helper() to rewrite a single reader.  It is invoked
 * with the instruction being replaced, the reader instruction, and the index
 * of the reader's source register that reads the replaced instruction. */
typedef void (*rc_presub_replace_fn)(struct rc_instruction *, struct rc_instruction *,
                                     unsigned int);

static struct rc_src_register
chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
{
   struct rc_src_register combine;
   combine.File = inner.File;
   combine.Index = inner.Index;
   combine.RelAddr = inner.RelAddr;
   if (outer.Abs) {
      combine.Abs = 1;
      combine.Negate = outer.Negate;
   } else {
      combine.Abs = inner.Abs;
      combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
      combine.Negate ^= outer.Negate;
   }
   combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
   return combine;
}

/* Reader callback for copy_propagate(): flags the scan as aborted when the
 * MOV's source cannot be substituted into the read `src` of `inst`. */
static void
copy_propagate_scan_read(void *data, struct rc_instruction *inst, struct rc_src_register *src)
{
   struct rc_reader_data *rd = data;
   struct rc_instruction *writer = rd->Writer;
   const rc_register_file read_file = src->File;
   int writer_src_is_plain;
   int reader_is_tex_or_kil;

   /* The writer's presubtract operation (if any) must be usable by the reader. */
   if (!rc_inst_can_use_presub(rd->C, inst, writer->U.I.PreSub.Opcode,
                               rc_swizzle_to_writemask(src->Swizzle), src,
                               &writer->U.I.PreSub.SrcReg[0], &writer->U.I.PreSub.SrcReg[1]))
      goto abort;

   /* XXX This could probably be handled better. */
   if (read_file == RC_FILE_ADDRESS)
      goto abort;

   /* R300/R400 is unhappy about propagating
    *  0: MOV temp[1], -none.1111;
    *  1: KIL temp[1];
    * to
    *  0: KIL -none.1111;
    *
    * R500 is fine with it.
    */
   if (inst->U.I.Opcode == RC_OPCODE_KIL && !rd->C->is_r500 &&
       writer->U.I.SrcReg[0].File == RC_FILE_NONE)
      goto abort;

   /* Texture instructions and KIL cannot read from the constants file
    * (see radeonTransformTEX()), so only temporary, input and "none"
    * sources may be propagated into them.
    */
   writer_src_is_plain = writer->U.I.SrcReg[0].File == RC_FILE_TEMPORARY ||
                         writer->U.I.SrcReg[0].File == RC_FILE_INPUT ||
                         writer->U.I.SrcReg[0].File == RC_FILE_NONE;
   reader_is_tex_or_kil = inst->U.I.Opcode == RC_OPCODE_TEX ||
                          inst->U.I.Opcode == RC_OPCODE_TXB ||
                          inst->U.I.Opcode == RC_OPCODE_TXP ||
                          inst->U.I.Opcode == RC_OPCODE_TXD ||
                          inst->U.I.Opcode == RC_OPCODE_TXL ||
                          inst->U.I.Opcode == RC_OPCODE_KIL;
   if (!writer_src_is_plain && reader_is_tex_or_kil)
      goto abort;

   return;

abort:
   rd->Abort = 1;
}

/* Per-source callback: requests an abort on any further read when the write
 * described by `data` overlaps `src`, or when `src` is relatively addressed
 * and the write targets the address register file. */
static void
src_clobbered_reads_cb(void *data, struct rc_instruction *inst, struct rc_src_register *src)
{
   struct src_clobbered_reads_cb_data *write = data;

   const int same_register_overlap =
      src->File == write->File && src->Index == write->Index &&
      (rc_swizzle_to_writemask(src->Swizzle) & write->Mask) != 0;
   const int address_overwritten = src->RelAddr && write->File == RC_FILE_ADDRESS;

   if (same_register_overlap || address_overwritten)
      write->ReaderData->AbortOnRead = RC_MASK_XYZW;
}

/* Write callback for rc_get_readers(): checks the write (file, index, mask)
 * against every source read by the scan's writer instruction. */
static void
is_src_clobbered_scan_write(void *data, struct rc_instruction *inst, rc_register_file file,
                            unsigned int index, unsigned int mask)
{
   struct rc_reader_data *rd = data;
   struct src_clobbered_reads_cb_data write = {
      .File = file,
      .Index = index,
      .Mask = mask,
      .ReaderData = rd,
   };

   rc_for_all_reads_src(rd->Writer, src_clobbered_reads_cb, &write);
}

static void
copy_propagate(struct radeon_compiler *c, struct rc_instruction *inst_mov)
{
   struct rc_reader_data reader_data;
   unsigned int i;

   if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY || inst_mov->U.I.WriteALUResult)
      return;

   /* Get a list of all the readers of this MOV instruction. */
   reader_data.ExitOnAbort = 1;
   rc_get_readers(c, inst_mov, &reader_data, copy_propagate_scan_read, NULL,
                  is_src_clobbered_scan_write);

   if (reader_data.Abort || reader_data.ReaderCount == 0)
      return;

   /* We can propagate SaturateMode if all the readers are MOV instructions
    * without a presubtract operation, source negation and absolute.
    * In that case, we just move SaturateMode to all readers. */
   if (inst_mov->U.I.SaturateMode) {
      for (i = 0; i < reader_data.ReaderCount; i++) {
         struct rc_instruction *inst = reader_data.Readers[i].Inst;

         if (inst->U.I.Opcode != RC_OPCODE_MOV || inst->U.I.SrcReg[0].File == RC_FILE_PRESUB ||
             inst->U.I.SrcReg[0].Abs || inst->U.I.SrcReg[0].Negate) {
            return;
         }
      }
   }

   /* Propagate the MOV instruction. */
   for (i = 0; i < reader_data.ReaderCount; i++) {
      struct rc_instruction *inst = reader_data.Readers[i].Inst;
      *reader_data.Readers[i].U.I.Src =
         chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);

      if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
         inst->U.I.PreSub = inst_mov->U.I.PreSub;
      if (!inst->U.I.SaturateMode)
         inst->U.I.SaturateMode = inst_mov->U.I.SaturateMode;
   }

   /* Finally, remove the original MOV instruction */
   rc_remove_instruction(inst_mov);
}

/**
 * Check if a source register is actually always the same
 * swizzle constant.
 *
 * @param src      source register to inspect
 * @param pswz     receives the constant swizzle shared by all used channels
 * @param pnegate  receives the negate bit shared by all used channels
 * @return 1 if the register file is RC_FILE_NONE and every used channel
 *         carries the same constant swizzle with the same negate bit,
 *         0 otherwise.
 *
 * Both outputs are always written.  *pswz is 0 whenever 0 is returned.  When
 * 1 is returned for a source whose channels are all unused, *pswz and
 * *pnegate are both 0.
 */
static int
is_src_uniform_constant(struct rc_src_register src, rc_swizzle *pswz, unsigned int *pnegate)
{
   int have_used = 0;

   /* Give both outputs a defined value on every path, including the
    * "all channels unused" case that returns 1 without seeing a channel. */
   *pswz = 0;
   *pnegate = 0;

   if (src.File != RC_FILE_NONE)
      return 0;

   for (unsigned int chan = 0; chan < 4; ++chan) {
      unsigned int swz = GET_SWZ(src.Swizzle, chan);

      /* Swizzles below 4 select a register channel, not a constant. */
      if (swz < 4) {
         *pswz = 0;
         return 0;
      }
      if (swz == RC_SWIZZLE_UNUSED)
         continue;

      if (!have_used) {
         /* First used channel defines the reference value. */
         *pswz = swz;
         *pnegate = GET_BIT(src.Negate, chan);
         have_used = 1;
      } else if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
         *pswz = 0;
         return 0;
      }
   }

   return 1;
}

/**
 * Replace 0.0, 1.0 and 0.5 immediate constants by their
 * respective swizzles.
 *
 * For every source operand of \p inst:
 *  - if no channel selects a register component (all swizzles are 0, 0.5, 1
 *    or unused), the register file is set to RC_FILE_NONE;
 *  - otherwise, if the operand reads a non-relative, in-range immediate
 *    constant, each channel whose immediate has magnitude 0.0, 1.0 or
 *    (when c->has_half_swizzles is set) 0.5 is rewritten to the matching
 *    constant swizzle, with a negative sign folded into the negate mask.
 *
 * The rewritten operand is only committed when
 * c->SwizzleCaps->IsNative() accepts it.  Finally, source slots beyond the
 * opcode's operand count are zeroed.
 */
static void
constant_folding(struct radeon_compiler *c, struct rc_instruction *inst)
{
   const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
   unsigned int i;

   /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
   for (unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
      struct rc_constant *constant;
      struct rc_src_register newsrc;
      int have_real_reference;
      unsigned int chan;

      /* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
      for (chan = 0; chan < 4; ++chan)
         if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
            break;
      if (chan == 4) {
         inst->U.I.SrcReg[src].File = RC_FILE_NONE;
         continue;
      }

      /* Convert immediates to swizzles.  Relatively addressed and
       * out-of-range constants cannot be looked up here. */
      if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT || inst->U.I.SrcReg[src].RelAddr ||
          inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
         continue;

      constant = &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];

      if (constant->Type != RC_CONSTANT_IMMEDIATE)
         continue;

      /* Work on a copy so the operand is untouched if the result is rejected. */
      newsrc = inst->U.I.SrcReg[src];
      have_real_reference = 0;
      for (chan = 0; chan < 4; ++chan) {
         unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
         unsigned int newswz;
         float imm;
         float baseimm;

         /* Channel already uses a constant swizzle or is unused. */
         if (swz >= 4)
            continue;

         /* baseimm is the magnitude of the immediate. */
         imm = constant->u.Immediate[swz];
         baseimm = imm;
         if (imm < 0.0)
            baseimm = -baseimm;

         if (baseimm == 0.0) {
            newswz = RC_SWIZZLE_ZERO;
         } else if (baseimm == 1.0) {
            newswz = RC_SWIZZLE_ONE;
         } else if (baseimm == 0.5 && c->has_half_swizzles) {
            newswz = RC_SWIZZLE_HALF;
         } else {
            /* This channel still needs the constant register itself. */
            have_real_reference = 1;
            continue;
         }

         SET_SWZ(newsrc.Swizzle, chan, newswz);
         /* Fold the sign of the immediate into the negate mask; with Abs set
          * the sign of the immediate has no effect on the value read. */
         if (imm < 0.0 && !newsrc.Abs)
            newsrc.Negate ^= 1 << chan;
      }

      /* No channel reads the constant register any more. */
      if (!have_real_reference) {
         newsrc.File = RC_FILE_NONE;
         newsrc.Index = 0;
      }

      /* don't make the swizzle worse */
      if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc))
         continue;

      inst->U.I.SrcReg[src] = newsrc;
   }

   /* In case this instruction has been converted, make sure all of the
    * registers that are no longer used are empty. */
   opcode = rc_get_opcode_info(inst->U.I.Opcode);
   for (i = opcode->NumSrcRegs; i < 3; i++) {
      memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
   }
}

/**
 * Report which components of dst's register are read through src.
 *
 * @return the writemask equivalent of src's swizzle when src and dst name the
 *         same register (same file and index), zero when they do not.
 */
static unsigned int
src_reads_dst_mask(struct rc_src_register src, struct rc_dst_register dst)
{
   const int same_register = dst.File == src.File && dst.Index == src.Index;

   return same_register ? rc_swizzle_to_writemask(src.Swizzle) : 0;
}

/* Tell whether any of the four channels of the source register uses a
 * constant swizzle (0, 0.5 or 1.0).  Returns 1 if so, 0 otherwise. */
static int
src_has_const_swz(struct rc_src_register src)
{
   int found = 0;

   for (int chan = 0; chan < 4 && !found; chan++) {
      const unsigned int swz = GET_SWZ(src.Swizzle, chan);

      found = swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF || swz == RC_SWIZZLE_ONE;
   }
   return found;
}

static void
presub_scan_read(void *data, struct rc_instruction *inst, struct rc_src_register *src)
{
   struct rc_reader_data *reader_data = data;
   rc_presubtract_op *presub_opcode = reader_data->CbData;

   if (!rc_inst_can_use_presub(
          reader_data->C, inst, *presub_opcode, reader_data->Writer->U.I.DstReg.WriteMask, src,
          &reader_data->Writer->U.I.SrcReg[0], &reader_data->Writer->U.I.SrcReg[1])) {
      reader_data->Abort = 1;
      return;
   }
}

/* Convert every reader of inst_add to use the given presubtract operation.
 *
 * Readers are gathered with rc_get_readers(); if the scan aborts or finds no
 * reader, nothing is changed and 0 is returned.  Otherwise presub_replace is
 * called for each reader's matching source slot and 1 is returned.  The
 * caller remains responsible for removing inst_add. */
static int
presub_helper(struct radeon_compiler *c, struct rc_instruction *inst_add,
              rc_presubtract_op presub_opcode, rc_presub_replace_fn presub_replace)
{
   struct rc_reader_data rd;
   rc_presubtract_op scan_op = presub_opcode;

   rd.CbData = &scan_op;
   rd.ExitOnAbort = 1;
   rc_get_readers(c, inst_add, &rd, presub_scan_read, NULL, is_src_clobbered_scan_write);

   if (rd.Abort || rd.ReaderCount == 0)
      return 0;

   for (unsigned int r = 0; r < rd.ReaderCount; r++) {
      struct rc_instruction *reader_inst = rd.Readers[r].Inst;
      struct rc_src_register *reader_src = rd.Readers[r].U.I.Src;
      const struct rc_opcode_info *info = rc_get_opcode_info(reader_inst->U.I.Opcode);

      /* Find which operand slot of the reader the recorded pointer refers to. */
      for (unsigned int slot = 0; slot < info->NumSrcRegs; slot++) {
         if (reader_src != &reader_inst->U.I.SrcReg[slot])
            continue;
         presub_replace(inst_add, reader_inst, slot);
      }
   }
   return 1;
}

static void
presub_replace_add(struct rc_instruction *inst_add, struct rc_instruction *inst_reader,
                   unsigned int src_index)
{
   rc_presubtract_op presub_opcode;

   unsigned int negates = 0;
   if (inst_add->U.I.SrcReg[0].Negate)
      negates++;
   if (inst_add->U.I.SrcReg[1].Negate)
      negates++;
   assert(negates != 2 ||
          ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) ==
           (inst_add->U.I.SrcReg[0].Negate & inst_add->U.I.DstReg.WriteMask)));

   if (negates == 1)
      presub_opcode = RC_PRESUB_SUB;
   else
      presub_opcode = RC_PRESUB_ADD;

   if (inst_add->U.I.SrcReg[1].Negate && negates == 1) {
      inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
      inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
   } else {
      inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
      inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
   }
   /* If both sources are negative we can move the negate to the presub. */
   unsigned negate_mask = negates == 1 ? 0 : inst_add->U.I.SrcReg[0].Negate;
   inst_reader->U.I.PreSub.SrcReg[0].Negate = negate_mask;
   inst_reader->U.I.PreSub.SrcReg[1].Negate = negate_mask;
   inst_reader->U.I.PreSub.Opcode = presub_opcode;
   inst_reader->U.I.SrcReg[src_index] =
      chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);
   inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
   inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
}

static int
is_presub_candidate(struct radeon_compiler *c, struct rc_instruction *inst)
{
   const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
   unsigned int i;
   unsigned int is_constant[2] = {0, 0};

   assert(inst->U.I.Opcode == RC_OPCODE_ADD || inst->U.I.Opcode == RC_OPCODE_MAD);

   if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE || inst->U.I.SaturateMode ||
       inst->U.I.WriteALUResult || inst->U.I.Omod) {
      return 0;
   }

   /* If first two sources use a constant swizzle, then we can't convert it to
    * a presubtract operation.  In fact for the ADD and SUB presubtract
    * operations neither source can contain a constant swizzle.  This
    * specific case is checked in peephole_add_presub_add() when
    * we make sure the swizzles for both sources are equal, so we
    * don't need to worry about it here. */
   for (i = 0; i < 2; i++) {
      int chan;
      for (chan = 0; chan < 4; chan++) {
         rc_swizzle swz = get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
         if (swz == RC_SWIZZLE_ONE || swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF) {
            is_constant[i] = 1;
         }
      }
   }
   if (is_constant[0] && is_constant[1])
      return 0;

   for (i = 0; i < info->NumSrcRegs; i++) {
      struct rc_src_register src = inst->U.I.SrcReg[i];
      if (src_reads_dst_mask(src, inst->U.I.DstReg))
         return 0;

      src.File = RC_FILE_PRESUB;
      if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src) &&
          inst->U.I.SrcReg[i].File != RC_FILE_NONE)
         return 0;
   }
   return 1;
}

static int
peephole_add_presub_add(struct radeon_compiler *c, struct rc_instruction *inst_add)
{
   unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
   unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
   unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;

   if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
      return 0;

   /* src0 and src1 can't have absolute values */
   if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
      return 0;

   /* if src0 is negative, at least all bits of dstmask have to be set */
   if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
      return 0;

   /* if src1 is negative, at least all bits of dstmask have to be set */
   if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
      return 0;

   if (!is_presub_candidate(c, inst_add))
      return 0;

   if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
      rc_remove_instruction(inst_add);
      return 1;
   }
   return 0;
}

static void
presub_replace_inv(struct rc_instruction *inst_add, struct rc_instruction *inst_reader,
                   unsigned int src_index)
{
   /* We must be careful not to modify inst_add, since it
    * is possible it will remain part of the program.*/
   inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
   inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
   inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
   inst_reader->U.I.SrcReg[src_index] =
      chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);

   inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
   inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
}

static void
presub_replace_bias(struct rc_instruction *inst_mad, struct rc_instruction *inst_reader,
                    unsigned int src_index)
{
   /* We must be careful not to modify inst_mad, since it
    * is possible it will remain part of the program.*/
   inst_reader->U.I.PreSub.SrcReg[0] = inst_mad->U.I.SrcReg[0];
   inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
   inst_reader->U.I.PreSub.Opcode = RC_PRESUB_BIAS;
   inst_reader->U.I.SrcReg[src_index] =
      chain_srcregs(inst_reader->U.I.SrcReg[src_index], inst_reader->U.I.PreSub.SrcReg[0]);

   inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
   inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_BIAS;
}

/**
 * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
 * Use the presubtract 1 - src0 for all readers of TEMP[0].  The first source
 * of the add instruction must have the constant 1 swizzle.  This function
 * does not check const registers to see if their value is 1.0, so it should
 * be called after the constant_folding optimization.
 * @return
 * 	0 if the ADD instruction is still part of the program.
 * 	1 if the ADD instruction is no longer part of the program.
 */
static int
peephole_add_presub_inv(struct radeon_compiler *c, struct rc_instruction *inst_add)
{
   unsigned int i, swz;

   if (!is_presub_candidate(c, inst_add))
      return 0;

   /* Check if src0 is 1. */
   /* XXX It would be nice to use is_src_uniform_constant here, but that
    * function only works if the register's file is RC_FILE_NONE */
   for (i = 0; i < 4; i++) {
      if (!(inst_add->U.I.DstReg.WriteMask & (1 << i)))
         continue;

      swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
      if (swz != RC_SWIZZLE_ONE || inst_add->U.I.SrcReg[0].Negate & (1 << i))
         return 0;
   }

   /* Check src1. */
   if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
          inst_add->U.I.DstReg.WriteMask ||
       inst_add->U.I.SrcReg[1].Abs || src_has_const_swz(inst_add->U.I.SrcReg[1])) {

      return 0;
   }

   if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
      rc_remove_instruction(inst_add);
      return 1;
   }
   return 0;
}

/**
 * PRESUB_BIAS: MAD -TEMP[0], 2.0, 1.0
 * Use the presubtract 1 - 2*src0 for all readers of the MAD's destination.
 *
 * On every written channel:
 *  - the third source must be the non-negated constant 1 swizzle,
 *  - the second source must be the non-negated value 2.0, given either as
 *    an inline constant or as an immediate from the constant table,
 *  - the first source must be negated, without absolute value and without
 *    constant swizzles.
 *
 * Constant registers are not inspected for the value 1.0, so this should be
 * called after the constant_folding optimization.
 * @return
 * 	0 if the MAD instruction is still part of the program.
 * 	1 if the MAD instruction is no longer part of the program.
 */
static int
peephole_mad_presub_bias(struct radeon_compiler *c, struct rc_instruction *inst_mad)
{
   unsigned int i, swz;

   if (!is_presub_candidate(c, inst_mad))
      return 0;

   /* Check if src2 is 1. */
   for (i = 0; i < 4; i++) {
      if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
         continue;

      swz = GET_SWZ(inst_mad->U.I.SrcReg[2].Swizzle, i);
      if (swz != RC_SWIZZLE_ONE || inst_mad->U.I.SrcReg[2].Negate & (1 << i))
         return 0;
   }

   /* Check if src1 is 2. */
   struct rc_src_register src1_reg = inst_mad->U.I.SrcReg[1];
   if ((src1_reg.Negate & inst_mad->U.I.DstReg.WriteMask) != 0 || src1_reg.Abs)
      return 0;
   if (src1_reg.File == RC_FILE_INLINE) {
      if (rc_inline_to_float(src1_reg.Index) != 2.0f)
         return 0;
   } else {
      /* Only a directly addressed, in-range constant can be looked up in
       * the constant table; with relative addressing the register actually
       * read is not known here.  Same guard as in constant_folding(). */
      if (src1_reg.File != RC_FILE_CONSTANT || src1_reg.RelAddr ||
          src1_reg.Index >= c->Program.Constants.Count)
         return 0;

      struct rc_constant *constant = &c->Program.Constants.Constants[src1_reg.Index];
      if (constant->Type != RC_CONSTANT_IMMEDIATE)
         return 0;
      for (i = 0; i < 4; i++) {
         if (!(inst_mad->U.I.DstReg.WriteMask & (1 << i)))
            continue;
         swz = GET_SWZ(src1_reg.Swizzle, i);
         /* The channel must select a real component whose immediate is 2.0. */
         if (swz >= RC_SWIZZLE_ZERO || constant->u.Immediate[swz] != 2.0)
            return 0;
      }
   }

   /* Check src0: negated on all written channels, no absolute value and no
    * constant swizzle. */
   if ((inst_mad->U.I.SrcReg[0].Negate & inst_mad->U.I.DstReg.WriteMask) !=
          inst_mad->U.I.DstReg.WriteMask ||
       inst_mad->U.I.SrcReg[0].Abs || src_has_const_swz(inst_mad->U.I.SrcReg[0])) {

      return 0;
   }

   if (presub_helper(c, inst_mad, RC_PRESUB_BIAS, presub_replace_bias)) {
      rc_remove_instruction(inst_mad);
      return 1;
   }
   return 0;
}

/* State shared with the omod_filter_*_cb() callbacks used by
 * peephole_mul_omod(). */
struct peephole_mul_cb_data {
   /* Destination register that the callbacks compare reads and writes against. */
   struct rc_dst_register *Writer;
   /* Set to 1 by a callback when a conflicting access is found. */
   unsigned int Clobbered;
};

/* Read callback: marks the scan as clobbered when the read (file, index,
 * mask) touches the destination register tracked in the callback data. */
static void
omod_filter_reader_cb(void *userdata, struct rc_instruction *inst, rc_register_file file,
                      unsigned int index, unsigned int mask)
{
   struct peephole_mul_cb_data *state = userdata;
   const struct rc_dst_register *dst = state->Writer;

   if (!rc_src_reads_dst_mask(file, mask, index, dst->File, dst->Index, dst->WriteMask))
      return;

   state->Clobbered = 1;
}

/* Write callback: marks the scan as clobbered when the write (file, index,
 * mask) overlaps the destination register tracked in the callback data. */
static void
omod_filter_writer_cb(void *userdata, struct rc_instruction *inst, rc_register_file file,
                      unsigned int index, unsigned int mask)
{
   struct peephole_mul_cb_data *state = userdata;
   const struct rc_dst_register *dst = state->Writer;

   if (file != dst->File || index != dst->Index)
      return;
   if ((mask & dst->WriteMask) == 0)
      return;

   state->Clobbered = 1;
}

/**
 * Try to fold a multiplication of a temporary by 2, 4, 8, 1/2, 1/4 or 1/8
 * into the output modifier (Omod) of the instruction(s) that write that
 * temporary, so that the multiplying instruction itself can be removed.
 *
 * NOTE(review): the opcode of inst_mul is not checked here; presumably the
 * caller only passes MUL instructions - confirm against the call site.
 *
 * @param c         compiler context, used to look up immediate constants
 * @param inst_mul  the multiplying instruction to fold
 * @param var_list  variable list used to find the writers of the temporary
 * @return
 * 	0 if inst_mul is still part of the program.
 * 	1 if inst_mul is no longer part of the program.
 */
static int
peephole_mul_omod(struct radeon_compiler *c, struct rc_instruction *inst_mul,
                  struct rc_list *var_list)
{
   unsigned int chan = 0, swz, i;
   int const_index = -1;
   int temp_index = -1;
   float const_value;
   rc_omod_op omod_op = RC_OMOD_DISABLE;
   struct rc_list *writer_list;
   struct rc_variable *var;
   struct peephole_mul_cb_data cb_data;
   unsigned writemask_sum;

   /* Classify the two sources: exactly one must be a temporary and the other
    * a constant (constant file, or "none" file with 0.5 swizzles). */
   for (i = 0; i < 2; i++) {
      unsigned int j;
      if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT &&
          inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY &&
          inst_mul->U.I.SrcReg[i].File != RC_FILE_NONE) {
         return 0;
      }

      /* The only relevant case with constant swizzles we should check for
       * is multiply by one half.
       */
      if (inst_mul->U.I.SrcReg[i].File == RC_FILE_NONE) {
         for (j = 0; j < 4; j++) {
            swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
            if (swz == RC_SWIZZLE_UNUSED) {
               continue;
            }
            if (swz != RC_SWIZZLE_HALF) {
               return 0;
            } else {
               omod_op = RC_OMOD_DIV_2;
            }
         }
      }

      if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
         if (temp_index != -1) {
            /* The instruction has two temp sources */
            return 0;
         } else {
            temp_index = i;
            continue;
         }
      }
      /* If we get this far Src[i] must be a constant src */
      if (inst_mul->U.I.SrcReg[i].Negate) {
         return 0;
      }
      /* The constant src needs to read from the same swizzle */
      swz = RC_SWIZZLE_UNUSED;
      chan = 0;
      for (j = 0; j < 4; j++) {
         unsigned int j_swz = GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
         if (j_swz == RC_SWIZZLE_UNUSED) {
            continue;
         }
         if (swz == RC_SWIZZLE_UNUSED) {
            /* Remember the first used channel; its value is looked up below. */
            swz = j_swz;
            chan = j;
         } else if (j_swz != swz) {
            return 0;
         }
      }

      if (const_index != -1) {
         /* The instruction has two constant sources */
         return 0;
      } else {
         const_index = i;
      }
   }

   /* No 0.5 swizzle was found: the multiplier must be an immediate whose
    * value maps onto one of the output modifiers. */
   if (omod_op == RC_OMOD_DISABLE) {
      if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
                                   inst_mul->U.I.SrcReg[const_index].Index)) {
         return 0;
      }
      const_value = rc_get_constant_value(c, inst_mul->U.I.SrcReg[const_index].Index,
                                          inst_mul->U.I.SrcReg[const_index].Swizzle,
                                          inst_mul->U.I.SrcReg[const_index].Negate, chan);

      if (const_value == 2.0f) {
         omod_op = RC_OMOD_MUL_2;
      } else if (const_value == 4.0f) {
         omod_op = RC_OMOD_MUL_4;
      } else if (const_value == 8.0f) {
         omod_op = RC_OMOD_MUL_8;
      } else if (const_value == (1.0f / 2.0f)) {
         omod_op = RC_OMOD_DIV_2;
      } else if (const_value == (1.0f / 4.0f)) {
         omod_op = RC_OMOD_DIV_4;
      } else if (const_value == (1.0f / 8.0f)) {
         omod_op = RC_OMOD_DIV_8;
      } else {
         return 0;
      }
   }

   /* Find the instructions that write the temporary source. */
   writer_list = rc_variable_list_get_writers_one_reader(var_list, RC_INSTRUCTION_NORMAL,
                                                         &inst_mul->U.I.SrcReg[temp_index]);

   if (!writer_list) {
      return 0;
   }

   /* Validate every writer before anything is modified. */
   cb_data.Clobbered = 0;
   cb_data.Writer = &inst_mul->U.I.DstReg;
   for (var = writer_list->Item; var; var = var->Friend) {
      struct rc_instruction *inst;
      const struct rc_opcode_info *info = rc_get_opcode_info(var->Inst->U.I.Opcode);
      /* Texture instructions and saturating writers are rejected. */
      if (info->HasTexture) {
         return 0;
      }
      if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
         return 0;
      }

      /* Empirical testing shows that DDX/DDY directly into output
       * with non-identity omod is problematic.
       */
      if ((info->Opcode == RC_OPCODE_DDX || info->Opcode == RC_OPCODE_DDY) &&
          inst_mul->U.I.DstReg.File == RC_FILE_OUTPUT) {
         return 0;
      }

      /* Walk backwards from inst_mul to the writer and make sure nothing in
       * between reads or writes inst_mul's destination register. */
      for (inst = inst_mul->Prev; inst != var->Inst; inst = inst->Prev) {
         rc_for_all_reads_mask(inst, omod_filter_reader_cb, &cb_data);
         rc_for_all_writes_mask(inst, omod_filter_writer_cb, &cb_data);
         if (cb_data.Clobbered) {
            break;
         }
      }
   }

   if (cb_data.Clobbered) {
      return 0;
   }

   writemask_sum = rc_variable_writemask_sum(writer_list->Item);

   /* rc_normal_rewrite_writemask can't expand a previous writemask to store
    * more channels replicated.
    */
   if (util_bitcount(writemask_sum) < util_bitcount(inst_mul->U.I.DstReg.WriteMask))
      return 0;

   /* Rewrite the instructions */
   for (var = writer_list->Item; var; var = var->Friend) {
      struct rc_variable *writer = var;
      unsigned conversion_swizzle = RC_SWIZZLE_UUUU;
      /* For each channel of inst_mul that reads component swz of the
       * temporary, record at position swz the channel it is read into. */
      for (chan = 0; chan < 4; chan++) {
         unsigned swz = GET_SWZ(inst_mul->U.I.SrcReg[temp_index].Swizzle, chan);
         if (swz <= RC_SWIZZLE_W)
            SET_SWZ(conversion_swizzle, swz, chan);
      }
      /* The writer now applies the output modifier and writes straight to
       * inst_mul's destination register, taking over its saturate mode. */
      writer->Inst->U.I.Omod = omod_op;
      writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
      writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
      rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
      writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
   }

   rc_remove_instruction(inst_mul);

   return 1;
}

/**
 * Run the presubtract peepholes that apply to inst: INV then ADD/SUB for an
 * ADD instruction, BIAS for a MAD instruction.  Other opcodes are left alone.
 *
 * @return
 * 	0 if inst is still part of the program.
 * 	1 if inst is no longer part of the program.
 */
int
rc_opt_presubtract(struct radeon_compiler *c, struct rc_instruction *inst, void *data)
{
   if (inst->U.I.Opcode == RC_OPCODE_ADD) {
      if (peephole_add_presub_inv(c, inst))
         return 1;
      return peephole_add_presub_add(c, inst) ? 1 : 0;
   }

   if (inst->U.I.Opcode == RC_OPCODE_MAD)
      return peephole_mad_presub_bias(c, inst) ? 1 : 0;

   return 0;
}

/* Merge two swizzles channel by channel: each channel takes its value from
 * swz1 unless swz1 leaves it unused, in which case swz2's value is taken. */
static unsigned int
merge_swizzles(unsigned int swz1, unsigned int swz2)
{
   unsigned int merged = rc_init_swizzle(RC_SWIZZLE_UNUSED, 0);

   for (unsigned int chan = 0; chan < 4; chan++) {
      const unsigned int first = GET_SWZ(swz1, chan);
      const unsigned int picked = (first != RC_SWIZZLE_UNUSED) ? first : GET_SWZ(swz2, chan);

      SET_SWZ(merged, chan, picked);
   }
   return merged;
}

/* Return the negate mask of src with the bits of unused channels cleared. */
static unsigned int
clean_negate(struct rc_src_register src)
{
   unsigned int used_channels = 0;

   for (unsigned int chan = 0; chan < 4; chan++) {
      if (GET_SWZ(src.Swizzle, chan) != RC_SWIZZLE_UNUSED)
         used_channels |= 1u << chan;
   }
   return src.Negate & used_channels;
}

/* Union of the negate masks of both sources, counting only the channels
 * each source actually uses. */
static unsigned int
merge_negates(struct rc_src_register src1, struct rc_src_register src2)
{
   const unsigned int first = clean_negate(src1);
   const unsigned int second = clean_negate(src2);

   return first | second;
}

/* Return orig_swz with every channel that is both unused and selected by
 * wmask replaced by const_swz.  Other channels are left as they are. */
static unsigned int
fill_swizzle(unsigned int orig_swz, unsigned int wmask, unsigned int const_swz)
{
   unsigned int filled = orig_swz;

   for (unsigned int chan = 0; chan < 4; chan++) {
      if (!(wmask & (1 << chan)))
         continue;
      if (GET_SWZ(filled, chan) != RC_SWIZZLE_UNUSED)
         continue;
      SET_SWZ(filled, chan, const_swz);
   }
   return filled;
}

static int
have_shared_source(struct rc_instruction *inst1, struct rc_instruction *inst2)
{
   int shared_src = -1;
   const struct rc_opcode_info *opcode1 = rc_get_opcode_info(inst1->U.I.Opcode);
   const struct rc_opcode_info *opcode2 = rc_get_opcode_info(inst2->U.I.Opcode);
   for (unsigned i = 0; i < opcode1->NumSrcRegs; i++) {
      for (unsigned j = 0; j < opcode2->NumSrcRegs; j++) {
         if (inst1->U.I.SrcReg[i].File == inst2->U.I.SrcReg[j].File &&
             inst1->U.I.SrcReg[i].Index == inst2->U.I.SrcReg[j].Index &&
             inst1->U.I.SrcReg[i].RelAddr == inst2->U.I.SrcReg[j].RelAddr &&
             inst1->U.I.SrcReg[i].Abs == inst2->U.I.SrcReg[j].Abs)
            shared_src = i;
      }
   }
   return shared_src;
}

/**
 * Merges two MOVs writing different channels of the same destination register
 * with the use of the constant swizzles.
 */
static bool
merge_movs(struct radeon_compiler *c, struct rc_instruction *inst, struct rc_instruction *cur)
{
   /* We can merge two MOVs into MOV if one of them is from inline constant,
    * i.e., constant swizzles and RC_FILE_NONE).
    *
    * For example
    *   MOV temp[0].x none.1___
    *   MOV temp[0].y input[0]._x__
    *
    * becomes
    *   MOV temp[0].xy input[0].1x__
    */
   unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
   if (cur->U.I.SrcReg[0].File == RC_FILE_NONE || inst->U.I.SrcReg[0].File == RC_FILE_NONE) {
      struct rc_src_register src;
      if (cur->U.I.SrcReg[0].File == RC_FILE_NONE)
         src = inst->U.I.SrcReg[0];
      else
         src = cur->U.I.SrcReg[0];
      src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle);
      src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
      if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
         cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
         cur->U.I.SrcReg[0] = src;
         rc_remove_instruction(inst);
         return true;
      }
   }

   /* Handle the trivial case where the MOVs share a source.
    *
    * For example
    *   MOV temp[0].x const[0].x
    *   MOV temp[0].y const[0].z
    *
    * becomes
    *   MOV temp[0].xy const[0].xz
    */
   if (have_shared_source(inst, cur) == 0) {
      struct rc_src_register src = cur->U.I.SrcReg[0];
      src.Negate = merge_negates(inst->U.I.SrcReg[0], cur->U.I.SrcReg[0]);
      src.Swizzle = merge_swizzles(cur->U.I.SrcReg[0].Swizzle, inst->U.I.SrcReg[0].Swizzle);

      if (c->SwizzleCaps->IsNative(RC_OPCODE_MOV, src)) {
         cur->U.I.DstReg.WriteMask |= orig_dst_wmask;
         cur->U.I.SrcReg[0] = src;
         rc_remove_instruction(inst);
         return true;
      }
   }

   /* Otherwise, we can convert the MOVs into ADD.
    *
    * For example
    *   MOV temp[0].x const[0].x
    *   MOV temp[0].y input[0].y
    *
    * becomes
    *   ADD temp[0].xy const[0].x0 input[0].0y
    */
   unsigned wmask = cur->U.I.DstReg.WriteMask | orig_dst_wmask;
   struct rc_src_register src0 = inst->U.I.SrcReg[0];
   struct rc_src_register src1 = cur->U.I.SrcReg[0];

   src0.Swizzle = fill_swizzle(src0.Swizzle, wmask, RC_SWIZZLE_ZERO);
   src1.Swizzle = fill_swizzle(src1.Swizzle, wmask, RC_SWIZZLE_ZERO);
   if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src0) ||
       !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, src1))
      return false;

   cur->U.I.DstReg.WriteMask = wmask;
   cur->U.I.Opcode = RC_OPCODE_ADD;
   cur->U.I.SrcReg[0] = src0;
   cur->U.I.SrcReg[1] = src1;

   /* finally delete the original mov */
   rc_remove_instruction(inst);
   return true;
}

/**
 * This function will try to merge MOV and ADD/MUL instructions with the same
 * destination, making use of the constant swizzles.
 *
 * For example:
 *   MOV temp[0].x const[0].x
 *   MUL temp[0].yz const[1].yz const[2].yz
 *
 * becomes
 *   MAD temp[0].xyz const[1].0yz const[2].0yz const[0].x00
 */
static int
merge_mov_add_mul(struct radeon_compiler *c, struct rc_instruction *inst1,
                  struct rc_instruction *inst2)
{
   struct rc_instruction *inst, *mov;
   if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
      mov = inst1;
      inst = inst2;
   } else {
      mov = inst2;
      inst = inst1;
   }

   const bool is_mul = inst->U.I.Opcode == RC_OPCODE_MUL;
   int shared_index = have_shared_source(inst, mov);
   unsigned wmask = mov->U.I.DstReg.WriteMask | inst->U.I.DstReg.WriteMask;

   /* If there is a shared source, just merge the swizzles and be done with it. */
   if (shared_index != -1) {
      struct rc_src_register shared_src = inst->U.I.SrcReg[shared_index];
      struct rc_src_register other_src = inst->U.I.SrcReg[1 - shared_index];

      shared_src.Negate = merge_negates(mov->U.I.SrcReg[0], shared_src);
      shared_src.Swizzle = merge_swizzles(shared_src.Swizzle, mov->U.I.SrcReg[0].Swizzle);
      other_src.Negate = clean_negate(other_src);
      unsigned int swz = is_mul ? RC_SWIZZLE_ONE : RC_SWIZZLE_ZERO;
      other_src.Swizzle = fill_swizzle(other_src.Swizzle, wmask, swz);

      if (!c->SwizzleCaps->IsNative(RC_OPCODE_ADD, shared_src) ||
          !c->SwizzleCaps->IsNative(RC_OPCODE_ADD, other_src))
         return 0;

      inst2->U.I.Opcode = inst->U.I.Opcode;
      inst2->U.I.SrcReg[0] = shared_src;
      inst2->U.I.SrcReg[1] = other_src;

      /* TODO: we can do a bit better in the special case when one of the sources is none.
       * Convert to MAD otherwise.
       */
   } else {
      struct rc_src_register src0, src1, src2;
      if (is_mul) {
         src2 = mov->U.I.SrcReg[0];
         src0 = inst->U.I.SrcReg[0];
         src1 = inst->U.I.SrcReg[1];
      } else {
         src0 = mov->U.I.SrcReg[0];
         src1 = inst->U.I.SrcReg[0];
         src2 = inst->U.I.SrcReg[1];
      }
      /* The following login expects that the unused channels have empty negate bits. */
      src0.Negate = clean_negate(src0);
      src1.Negate = clean_negate(src1);
      src2.Negate = clean_negate(src2);

      src0.Swizzle = fill_swizzle(src0.Swizzle, wmask, RC_SWIZZLE_ONE);
      src1.Swizzle = fill_swizzle(src1.Swizzle, wmask, is_mul ? RC_SWIZZLE_ZERO : RC_SWIZZLE_ONE);
      src2.Swizzle = fill_swizzle(src2.Swizzle, wmask, RC_SWIZZLE_ZERO);
      if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src0) ||
          !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src1) ||
          !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src2))
         return 0;

      inst2->U.I.Opcode = RC_OPCODE_MAD;
      inst2->U.I.SrcReg[0] = src0;
      inst2->U.I.SrcReg[1] = src1;
      inst2->U.I.SrcReg[2] = src2;
   }
   inst2->U.I.DstReg.WriteMask = wmask;
   /* finally delete the original instruction */
   rc_remove_instruction(inst1);

   return 1;
}

/**
 * This function will try to merge MOV and MAD instructions with the same
 * destination, making use of the constant swizzles. This only works
 * if there is a shared source or one of the sources is RC_FILE_NONE.
 *
 * For example:
 *   MOV temp[0].x const[0].x
 *   MAD temp[0].yz const[0].yz const[1].yz input[0].xw
 *
 * becomes
 *   MAD temp[0].xyz const[0].xyz const[2].1yz input[0].0xw
 */
static bool
merge_mov_mad(struct radeon_compiler *c, struct rc_instruction *inst1, struct rc_instruction *inst2)
{
   struct rc_instruction *mov, *mad;
   if (inst1->U.I.Opcode == RC_OPCODE_MOV) {
      mov = inst1;
      mad = inst2;
   } else {
      mov = inst2;
      mad = inst1;
   }

   int shared_index = have_shared_source(mad, mov);
   unsigned wmask = mov->U.I.DstReg.WriteMask | mad->U.I.DstReg.WriteMask;
   struct rc_src_register src[3];
   src[0] = mad->U.I.SrcReg[0];
   src[1] = mad->U.I.SrcReg[1];
   src[2] = mad->U.I.SrcReg[2];

   /* Shared source is the one for multiplication. */
   if (shared_index == 0 || shared_index == 1) {
      src[shared_index].Negate = merge_negates(src[shared_index], mov->U.I.SrcReg[0]);
      src[1 - shared_index].Negate = clean_negate(src[1 - shared_index]);
      src[shared_index].Swizzle =
         merge_swizzles(src[shared_index].Swizzle, mov->U.I.SrcReg[0].Swizzle);
      src[1 - shared_index].Swizzle =
         fill_swizzle(src[1 - shared_index].Swizzle, wmask, RC_SWIZZLE_ONE);
      src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);

      /* Shared source is the one for used for addition, or it is none. Additionally,
       * if the mov SrcReg is none, we merge it with the addition (third) reg as well
       * because than we have the highest change the swizzles will be legal.
       */
   } else if (shared_index == 2 || mov->U.I.SrcReg[0].File == RC_FILE_NONE ||
              src[2].File == RC_FILE_NONE) {
      src[2].Negate = merge_negates(src[2], mov->U.I.SrcReg[0]);
      src[2].Swizzle = merge_swizzles(src[2].Swizzle, mov->U.I.SrcReg[0].Swizzle);
      src[0].Swizzle = fill_swizzle(src[0].Swizzle, wmask, RC_SWIZZLE_ZERO);
      src[1].Swizzle = fill_swizzle(src[1].Swizzle, wmask, RC_SWIZZLE_ZERO);
      if (src[2].File == RC_FILE_NONE) {
         src[2].File = mov->U.I.SrcReg[0].File;
         src[2].Index = mov->U.I.SrcReg[0].Index;
         src[2].RelAddr = mov->U.I.SrcReg[0].RelAddr;
         src[2].Abs = mov->U.I.SrcReg[0].Abs;
      }

      /* First or the second MAD source is RC_FILE_NONE, we merge the mov into it,
       * fill the other one with ones and the reg for addition with zeros.
       */
   } else if (src[0].File == RC_FILE_NONE || src[1].File == RC_FILE_NONE) {
      unsigned none_src = src[0].File == RC_FILE_NONE ? 0 : 1;
      src[none_src] = mov->U.I.SrcReg[0];
      src[none_src].Negate = merge_negates(src[none_src], mad->U.I.SrcReg[none_src]);
      src[none_src].Swizzle =
         merge_swizzles(src[none_src].Swizzle, mad->U.I.SrcReg[none_src].Swizzle);
      src[1 - none_src].Negate = clean_negate(src[1 - none_src]);
      src[1 - none_src].Swizzle = fill_swizzle(src[1 - none_src].Swizzle, wmask, RC_SWIZZLE_ONE);
      src[2].Swizzle = fill_swizzle(src[2].Swizzle, wmask, RC_SWIZZLE_ZERO);
   } else {
      return false;
   }

   if (!c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[0]) ||
       !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[1]) ||
       !c->SwizzleCaps->IsNative(RC_OPCODE_MAD, src[2]))
      return false;

   inst2->U.I.Opcode = RC_OPCODE_MAD;
   inst2->U.I.SrcReg[0] = src[0];
   inst2->U.I.SrcReg[1] = src[1];
   inst2->U.I.SrcReg[2] = src[2];
   inst2->U.I.DstReg.WriteMask = wmask;
   rc_remove_instruction(inst1);
   return true;
}

/**
 * Check whether the two instructions carry the given pair of opcodes, in
 * either order.
 *
 * \return true if one instruction has opcode1 and the other one opcode2
 */
static bool
inst_combination(struct rc_instruction *inst1, struct rc_instruction *inst2, rc_opcode opcode1,
                 rc_opcode opcode2)
{
   const bool in_order = inst1->U.I.Opcode == opcode1 && inst2->U.I.Opcode == opcode2;
   const bool swapped = inst2->U.I.Opcode == opcode1 && inst1->U.I.Opcode == opcode2;

   return in_order || swapped;
}

/**
 * Searches for instructions writing different channels of the same register that could
 * be merged together with the use of constant swizzles.
 *
 * The potential candidates are combinations of MOVs, ADDs, MULs and MADs.
 *
 * Starting right after \p inst, the following instructions are scanned until
 * a merge succeeds, the end of the program is reached, or something is found
 * that makes moving \p inst down to the candidate unsafe (control flow, a
 * write to the same destination channels, a read of the destination register,
 * or a write to one of the source registers of \p inst).
 *
 * \param c    compiler context
 * \param inst the instruction to merge into a later one; it is removed from
 *             the program when a merge succeeds
 */
static void
merge_channels(struct radeon_compiler *c, struct rc_instruction *inst)
{
   const unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
   const unsigned int orig_dst_file = inst->U.I.DstReg.File;
   const unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
   const struct rc_opcode_info *orig_opcode = rc_get_opcode_info(inst->U.I.Opcode);

   /* Iterate over real instructions only: the list head is a sentinel and
    * must not be inspected as if it were an instruction.
    */
   for (struct rc_instruction *cur = inst->Next; cur != &c->Program.Instructions;
        cur = cur->Next) {
      const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);

      /* Keep it simple for now and stop when encountering any
       * control flow.
       */
      if (opcode->IsFlowControl)
         return;

      /* Stop when the original destination is overwritten */
      if (orig_dst_reg == cur->U.I.DstReg.Index && orig_dst_file == cur->U.I.DstReg.File &&
          (orig_dst_wmask & cur->U.I.DstReg.WriteMask) != 0)
         return;

      /* Stop the search when the original instruction destination
       * is used as a source for anything.
       */
      for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
         if (cur->U.I.SrcReg[i].File == orig_dst_file && cur->U.I.SrcReg[i].Index == orig_dst_reg)
            return;
      }

      /* Stop the search when some of the original sources are touched. */
      for (unsigned i = 0; i < orig_opcode->NumSrcRegs; i++) {
         if (inst->U.I.SrcReg[i].File == cur->U.I.DstReg.File &&
             inst->U.I.SrcReg[i].Index == cur->U.I.DstReg.Index)
            return;
      }

      /* Only an instruction writing other channels of the very same register
       * with the same saturate mode is a merge candidate.
       */
      if (cur->U.I.DstReg.File != orig_dst_file || cur->U.I.DstReg.Index != orig_dst_reg ||
          cur->U.I.SaturateMode != inst->U.I.SaturateMode ||
          (cur->U.I.DstReg.WriteMask & orig_dst_wmask) != 0)
         continue;

      if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MOV)) {
         if (merge_movs(c, inst, cur))
            return;
      }

      /* Skip the merge if one of the instructions writes just w channel
       * and the target has output modifiers (c->has_omod, i.e. presumably a
       * fragment shader). We can pair-schedule it together later anyway and
       * it will also give the scheduler a bit more flexibility.
       * Only check this after merging MOVs as when we manage to merge two MOVs
       * into another MOV we can still copy propagate it away. So it is a win in
       * that case.
       */
      if (c->has_omod &&
          (cur->U.I.DstReg.WriteMask == RC_MASK_W || inst->U.I.DstReg.WriteMask == RC_MASK_W))
         continue;

      if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_ADD) ||
          inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MUL)) {
         if (merge_mov_add_mul(c, inst, cur))
            return;
      }

      if (inst_combination(cur, inst, RC_OPCODE_MOV, RC_OPCODE_MAD)) {
         if (merge_mov_mad(c, inst, cur))
            return;
      }
   }
}

/**
 * Searches for duplicate ARLs/ARRs
 *
 * Only a very trivial case is optimized for now: when a later A0 load of the
 * same kind reads the same register with the same swizzle as \p inst, and
 * nothing in between invalidates the first load, the later load is removed.
 *
 * The scan stops at
 *  - any control flow other than simple IF/ELSE/ENDIF nested after \p inst,
 *  - the ELSE or ENDIF of a block that was already open at \p inst (past that
 *    point \p inst is not guaranteed to have executed),
 *  - a write to the source channels read by \p inst,
 *  - an A0 load of the other kind or with a different source.
 *
 * \param c      compiler context
 * \param inst   the first A0 load (ARL or ARR)
 * \param is_ARL true if \p inst is an ARL, false if it is an ARR
 */
static void
merge_A0_loads(struct radeon_compiler *c, struct rc_instruction *inst, bool is_ARL)
{
   const unsigned int A0_src_reg = inst->U.I.SrcReg[0].Index;
   const unsigned int A0_src_file = inst->U.I.SrcReg[0].File;
   const unsigned int A0_src_swizzle = inst->U.I.SrcReg[0].Swizzle;
   /* Source channels actually read by the load. */
   const unsigned int A0_src_mask = rc_swizzle_to_writemask(A0_src_swizzle);
   /* Nesting depth of IF blocks opened after the original load. */
   int cf_depth = 0;

   /* The successor is fetched before the body runs so that removing cur does
    * not make the walk skip the instruction that follows it, and the list
    * head sentinel is never inspected as an instruction.
    */
   struct rc_instruction *next;
   for (struct rc_instruction *cur = inst->Next; cur != &c->Program.Instructions; cur = next) {
      next = cur->Next;
      const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);

      /* Keep it simple for now and stop when encountering any
       * control flow besides simple ifs.
       */
      if (opcode->IsFlowControl) {
         switch (cur->U.I.Opcode) {
         case RC_OPCODE_IF:
            cf_depth++;
            break;
         case RC_OPCODE_ELSE:
            /* ELSE of the block containing the original load. */
            if (cf_depth < 1)
               return;
            break;
         case RC_OPCODE_ENDIF:
            /* Leaving the block containing the original load: code after it
             * also runs when the branch with the original load was not taken.
             */
            if (cf_depth < 1)
               return;
            cf_depth--;
            break;
         default:
            return;
         }
      }

      /* Stop when the channels read by the original load are overwritten */
      if (A0_src_reg == cur->U.I.DstReg.Index && A0_src_file == cur->U.I.DstReg.File &&
          (cur->U.I.DstReg.WriteMask & A0_src_mask) != 0)
         return;

      /* Wrong A0 load type. */
      if ((is_ARL && cur->U.I.Opcode == RC_OPCODE_ARR) ||
          (!is_ARL && cur->U.I.Opcode == RC_OPCODE_ARL))
         return;

      if (cur->U.I.Opcode == RC_OPCODE_ARL || cur->U.I.Opcode == RC_OPCODE_ARR) {
         /* A load from a different source changes A0, nothing more to merge. */
         if (A0_src_reg != cur->U.I.SrcReg[0].Index || A0_src_file != cur->U.I.SrcReg[0].File ||
             A0_src_swizzle != cur->U.I.SrcReg[0].Swizzle)
            return;

         /* Exact duplicate of the original load. */
         rc_remove_instruction(cur);
      }
   }
}

/**
 * Search for instructions where we could copy propagate the constant swizzle
 * to the reader, for example the following sequence
 *
 * SIN temp[1].x, temp[0].x___;
 * MOV temp[1].y, none._1__;
 * ADD temp[2].xyz, temp[1].xy__, const[0].ww__;
 *
 * could be transformed into
 *
 * SIN temp[1].x, temp[0].x___;
 * ADD temp[2].xyz, temp[1].x1__, const[0].ww__;
 *
 * The scan covers the straight-line code following \p inst and stops at the
 * first flow control instruction. The MOV itself is removed when no readers
 * are left afterwards.
 *
 * \param c    compiler context
 * \param inst the MOV whose source swizzle selects constants; the caller in
 *             this file only passes MOVs from RC_FILE_NONE to a temporary
 */
static void
copy_propagate_constant_swizzle(struct radeon_compiler *c, struct rc_instruction *inst)
{
   const unsigned int orig_dst_reg = inst->U.I.DstReg.Index;
   const unsigned int orig_dst_wmask = inst->U.I.DstReg.WriteMask;
   const unsigned int mov_swizzle = inst->U.I.SrcReg[0].Swizzle;
   const unsigned int mov_negate = inst->U.I.SrcReg[0].Negate;

   /* Iterate over real instructions only, never over the list head sentinel. */
   for (struct rc_instruction *cur = inst->Next; cur != &c->Program.Instructions;
        cur = cur->Next) {
      const struct rc_opcode_info *opcode = rc_get_opcode_info(cur->U.I.Opcode);

      /* Keep it simple for now and stop when encountering any control flow. */
      if (opcode->IsFlowControl)
         break;

      /* We should not see the original destination overwritten (within a control flow block).
       * If that happens, we don't have SSA-like anymore and several later assumptions would
       * be broken. */
      assert(orig_dst_reg != cur->U.I.DstReg.Index || RC_FILE_TEMPORARY != cur->U.I.DstReg.File ||
             (orig_dst_wmask & cur->U.I.DstReg.WriteMask) == 0);

      for (unsigned int src = 0; src < opcode->NumSrcRegs; src++) {
         struct rc_src_register new_src = cur->U.I.SrcReg[src];

         /* Only sources reading at least one channel written by the MOV matter. */
         if (new_src.File != RC_FILE_TEMPORARY || new_src.Index != orig_dst_reg ||
             !(rc_swizzle_to_writemask(new_src.Swizzle) & orig_dst_wmask))
            continue;

         /* Construct the new swizzle. */
         unsigned new_swizzle = new_src.Swizzle;
         unsigned negate = 0;
         for (unsigned chan = 0; chan < 4; chan++) {
            const unsigned swz = GET_SWZ(new_swizzle, chan);

            /* Skip constant/unused selectors and channels that were written
             * by some other instruction than this MOV.
             */
            if (swz > RC_SWIZZLE_W || !(orig_dst_wmask & (1u << swz)))
               continue;

            const unsigned swz_to_propagate = GET_SWZ(mov_swizzle, swz);
            if (swz_to_propagate > RC_SWIZZLE_W && swz_to_propagate < RC_SWIZZLE_UNUSED) {
               SET_SWZ(new_swizzle, chan, swz_to_propagate);
               /* The negate bit of the MOV is indexed by the MOV channel
                * (swz), while the negate bit of the reader is indexed by
                * the reader channel (chan).
                */
               if (mov_negate & (1u << swz))
                  negate |= 1u << chan;
            }
         }
         new_src.Swizzle = new_swizzle;
         /* With Abs on the reader the sign of the propagated constant is
          * discarded and only the negate of the reader applies, the same way
          * chain_srcregs() combines an outer Abs with an inner negate.
          */
         if (!new_src.Abs)
            new_src.Negate ^= negate;
         if (!c->SwizzleCaps->IsNative(cur->U.I.Opcode, new_src))
            continue;

         cur->U.I.SrcReg[src] = new_src;
      }
   }

   /* Now check if we can delete the original MOV. Specifically if we still have some
    * readers left.
    */
   struct rc_reader_data readers;
   readers.ExitOnAbort = 0;
   rc_get_readers(c, inst, &readers, NULL, NULL, NULL);
   if (readers.ReaderCount == 0)
      rc_remove_instruction(inst);
}

/**
 * According to the GLSL spec, round is only available in 1.30 and up, so the
 * only reason why we should ever see a ROUND here is that it actually is a
 * lowered ARR (from nine->ttn). In that case we want to reconstruct the ARR
 * instead of lowering the round.
 *
 * If every reader of the ROUND result is an ARL, all of them are turned into
 * ARR and the ROUND itself becomes a plain MOV. Otherwise nothing is changed
 * (and debug builds assert).
 */
static void
transform_vertex_ROUND(struct radeon_compiler *c, struct rc_instruction *inst)
{
   struct rc_reader_data readers;
   readers.ExitOnAbort = 0;
   rc_get_readers(c, inst, &readers, NULL, NULL, NULL);

   assert(readers.ReaderCount > 0);

   /* First pass: find the first reader that is not an ARL, if any. */
   unsigned idx = 0;
   while (idx < readers.ReaderCount &&
          readers.Readers[idx].Inst->U.I.Opcode == RC_OPCODE_ARL)
      idx++;

   if (idx != readers.ReaderCount) {
      assert(!"Unable to convert ROUND+ARL to ARR\n");
      return;
   }

   /* Second pass: only ARL readers, convert all of them to ARR. */
   for (idx = 0; idx < readers.ReaderCount; idx++) {
      struct rc_instruction *arl = readers.Readers[idx].Inst;
      arl->U.I.Opcode = RC_OPCODE_ARR;
   }

   /* Switch ROUND to MOV and let copy propagate sort it out later. */
   inst->U.I.Opcode = RC_OPCODE_MOV;
}

/**
 * Apply various optimizations specific to the A0 address register loads:
 * duplicate ARL/ARR loads are merged and ROUND feeding ARL is turned back
 * into ARR.
 */
static void
optimize_A0_loads(struct radeon_compiler *c)
{
   struct rc_instruction *const end = &c->Program.Instructions;
   struct rc_instruction *next;

   for (struct rc_instruction *cur = end->Next; cur != end; cur = next) {
      /* Pick the successor up front, as is done by the other passes here. */
      next = cur->Next;

      switch (cur->U.I.Opcode) {
      case RC_OPCODE_ARL:
         merge_A0_loads(c, cur, true);
         break;
      case RC_OPCODE_ARR:
         merge_A0_loads(c, cur, false);
         break;
      case RC_OPCODE_ROUND:
         transform_vertex_ROUND(c, cur);
         break;
      default:
         break;
      }
   }
}

void
rc_optimize(struct radeon_compiler *c, void *user)
{
   struct rc_instruction *inst = c->Program.Instructions.Next;
   while (inst != &c->Program.Instructions) {
      struct rc_instruction *cur = inst;
      inst = inst->Next;
      constant_folding(c, cur);
   }

   /* Copy propagate simple movs away. */
   inst = c->Program.Instructions.Next;
   while (inst != &c->Program.Instructions) {
      struct rc_instruction *cur = inst;
      inst = inst->Next;
      if (cur->U.I.Opcode == RC_OPCODE_MOV) {
         copy_propagate(c, cur);
      }
   }

   if (c->type == RC_VERTEX_PROGRAM) {
      optimize_A0_loads(c);
   }

   /* Merge MOVs to same source in different channels using the constant
    * swizzle.
    */
   if (c->is_r500 || c->type == RC_VERTEX_PROGRAM) {
      inst = c->Program.Instructions.Next;
      while (inst != &c->Program.Instructions) {
         struct rc_instruction *cur = inst;
         inst = inst->Next;
         if (cur->U.I.Opcode == RC_OPCODE_MOV || cur->U.I.Opcode == RC_OPCODE_ADD ||
             cur->U.I.Opcode == RC_OPCODE_MAD || cur->U.I.Opcode == RC_OPCODE_MUL)
            merge_channels(c, cur);
      }
   }

   /* Copy propagate few extra movs from the merge_channels pass. */
   inst = c->Program.Instructions.Next;
   while (inst != &c->Program.Instructions) {
      struct rc_instruction *cur = inst;
      inst = inst->Next;
      if (cur->U.I.Opcode == RC_OPCODE_MOV) {
         copy_propagate(c, cur);
      }
   }

   /* Copy propagate constant swizzles */
   inst = c->Program.Instructions.Next;
   while (inst != &c->Program.Instructions) {
      struct rc_instruction *cur = inst;
      inst = inst->Next;
      if (cur->U.I.Opcode == RC_OPCODE_MOV && cur->U.I.SrcReg[0].File == RC_FILE_NONE &&
          cur->U.I.DstReg.File == RC_FILE_TEMPORARY) {
         copy_propagate_constant_swizzle(c, cur);
      }
   }

   if (c->type != RC_FRAGMENT_PROGRAM) {
      return;
   }

   /* Output modifiers. */
   inst = c->Program.Instructions.Next;
   struct rc_list *var_list = NULL;
   while (inst != &c->Program.Instructions) {
      struct rc_instruction *cur = inst;
      inst = inst->Next;
      if (cur->U.I.Opcode == RC_OPCODE_MUL) {
         if (!var_list)
            var_list = rc_get_variables(c);
         if (peephole_mul_omod(c, cur, var_list))
            var_list = NULL;
      }
   }
}
